#coding:utf-8
from spider import DataBase
import codecs

class Handle(DataBase):
    '''Export rows of a page table into a ``_tgtxt_page_`` text file.

    Rows are fetched through ``self.select`` (inherited from ``DataBase``),
    their title/content columns are decoded from GBK, and each row is
    appended as one record to ``sougou.tgtxt``.
    '''

    def open_file(self):
        '''Open ``sougou.tgtxt`` in append mode and keep it on ``self.fp``.

        The file is opened through ``codecs`` with the ``utf-8`` codec, so
        text written to it is encoded as UTF-8.
        '''
        self.fp = codecs.open('sougou.tgtxt', 'a', 'utf-8')

    def close_file(self):
        '''Close the output file previously opened by ``open_file``.'''
        self.fp.close()

    def to_utf8(self, data):
        '''Decode a GBK byte string into text, dropping undecodable bytes.

        Despite the name, the result is decoded text, not UTF-8 bytes; the
        UTF-8 encoding happens when the text is written through ``self.fp``.

        ``None`` (e.g. a NULL column) becomes an empty string, and values
        that are not byte strings are returned unchanged instead of raising.
        '''
        if data is None:
            return u''
        if isinstance(data, (bytes, bytearray)):
            return data.decode('gbk', 'ignore')
        return data

    def db_count(self, tbname):
        '''Return ``COUNT(*)`` for ``tbname``, or 0 if the query yields no row.

        ``tbname`` is interpolated directly into the SQL text (identifiers
        cannot be bound as parameters), so it must come from trusted code.
        '''
        sql = 'SELECT COUNT(*) FROM %s' % tbname
        for r in self.select(sql):
            return r[0]
        # No row came back; report an empty table rather than None.
        return 0

    def db_get(self, tbname, begin, end):
        '''Return rows of ``tbname`` with ``begin <= id < end``.

        Each row holds ``(id, url, docno, title, content)`` in that order.
        ``begin`` and ``end`` are coerced with ``int()`` so that only numeric
        bounds can reach the SQL text; ``tbname`` is interpolated as-is and
        must come from trusted code.
        '''
        sql = 'SELECT id,url,docno,title,content FROM %s WHERE id>=%s and id<%s;' % (
            tbname, int(begin), int(end))
        return self.select(sql)

    def start(self, tbname, step=10, end=-1):
        '''Export rows of ``tbname`` to the output file in id windows.

        Rows are read in consecutive windows ``[low, low + step)`` starting
        at id 1, and every row is passed to ``handle``.

        tbname -- table to read from.
        step   -- number of ids covered by each window.
        end    -- id limit. ``-1`` (the default) covers every id up to and
                  including the table's row count. With an explicit value,
                  every id below ``end`` is covered (the final window may
                  reach past it).

        The output file is closed even if a query or a row fails.
        '''
        begin = 1
        self.open_file()
        try:
            if end == -1:
                # db_get's upper bound is exclusive, so the row whose id
                # equals the row count needs a bound of count + 1; without
                # the +1 it was skipped whenever (count - 1) % step == 0.
                # NOTE(review): using the row count as the highest id
                # assumes ids are contiguous from 1 -- confirm for the table.
                end = self.db_count(tbname) + 1
            last_index = begin
            for i in range(begin + step, end + step, step):
                print('%s-%s' % (last_index, i))
                for r in self.db_get(tbname, last_index, i):
                    self.handle(r)
                last_index = i
        finally:
            self.close_file()

    def handle(self, info):
        '''Decode one row from ``db_get`` and append it to the output file.

        ``info`` is indexed as ``(id, url, docno, title, content)``; only
        title and content are decoded from GBK.
        '''
        title = self.to_utf8(info[3])
        content = self.to_utf8(info[4])
        self.save(info[0], info[1], info[2], title, content)

    def save(self, id, url, docno, title, content):
        '''Append one page record to ``self.fp``.

        The record carries the fixed sublib name ``sougou`` plus ``title``,
        ``url`` and ``content``. ``id`` and ``docno`` are accepted but not
        written.
        '''
        data = '\n_tgtxt_page_\n\nsublib:%s\ntitle:%s\nurl:%s\n%s\n' % (
            'sougou', title, url, content)
        self.fp.write(data)
#--------------------#
# Driver: export the whole 'sougou_data' table in windows of 100 ids.
# NOTE(review): the constructor arguments are presumably host, user,
# password and database name -- confirm against spider.DataBase.
handle = Handle('localhost', 'dzlua', 'dzlua', 'spider')
try:
    handle.start('sougou_data', 100, -1)
finally:
    # Release the database handle even if the export fails part-way.
    handle.close()
